# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install()
# BiocManager::install("biomaRt")
# install.packages("tidyverse")
# install.packages("readxl")
# BiocManager::install("ComplexHeatmap")
# install.packages("matrixStats")
# install.packages("pheatmap")
# install.packages("RVAideMemoire")
# install.packages("dendextend")
# install.packages("binom")
# BiocManager::install("DESeq2")
# install.packages("corrplot")
# BiocManager::install("apeglm")
# install.packages("ashr")
Load necessary packages
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.1
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.1.1
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Warning: package 'MatrixGenerics' was built under R version 4.1.1
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
## corrplot 0.92 loaded
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## Warning: package 'readr' was built under R version 4.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse() masks IRanges::collapse()
## x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count() masks matrixStats::count()
## x dplyr::desc() masks IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x dplyr::lag() masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename() masks S4Vectors::rename()
## x dplyr::slice() masks IRanges::slice()
## Warning: multiple methods tables found for 'union'
## Warning: multiple methods tables found for 'intersect'
##
## Attaching package: 'InterMineR'
## The following objects are masked from 'package:dplyr':
##
## intersect, union
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## intersect, union
## The following objects are masked from 'package:S4Vectors':
##
## intersect, union
## The following objects are masked from 'package:BiocGenerics':
##
## intersect, union
## The following objects are masked from 'package:base':
##
## intersect, union
Read in the counts data
countsData <- read.delim(file = "../01_input/RWP27_Embryo_Intestine_FACS_200923_all.counts", sep = " ")
head(countsData)
## chr start stop strand length embryoCells_rep1
## WBGene00014450 MtDNA 1 55 + 55 0
## WBGene00014451 MtDNA 58 111 + 54 0
## WBGene00010957 MtDNA 113 549 + 437 0
## WBGene00010958 MtDNA 549 783 + 235 0
## WBGene00014452 MtDNA 785 840 + 56 0
## WBGene00014453 MtDNA 842 896 + 55 0
## embryoGFPminus_rep2 embryoGFPplus_rep2 embryoWhole_rep3
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryoCells_rep2 embryoGFPminus_rep3 embryoGFPplus_rep3
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryoGFPminus_rep1 embryoGFPplus_rep1 embryoWhole_rep2
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
colnames(countsData[6:15])
## [1] "embryoCells_rep1" "embryoGFPminus_rep2" "embryoGFPplus_rep2"
## [4] "embryoWhole_rep3" "embryoCells_rep2" "embryoGFPminus_rep3"
## [7] "embryoGFPplus_rep3" "embryoGFPminus_rep1" "embryoGFPplus_rep1"
## [10] "embryoWhole_rep2"
Read in and print metadata file
metadata1 <- read.table(file = "../01_input/RWP27_metadata.tsv", header = FALSE, stringsAsFactors = FALSE)
colnames(metadata1) <- c("Filename.Fwd", "Filename.Rev", "names")
rep <- c(1,1,1,2,2,2,2,3,3,3)
type <- c("cells", "gut", "gutless", "whole", "cells", "gut", "gutless", "whole", "gut", "gutless")
metadata1 <- cbind(metadata1, rep, type)
metadata1
## Filename.Fwd Filename.Rev names rep type
## 1 RW57_S10_L003_R1_001 RW57_S10_L003_R2_001 embryoCells_rep1 1 cells
## 2 RW58_S11_L003_R1_001 RW58_S11_L003_R2_001 embryoGFPplus_rep1 1 gut
## 3 RW59_S12_L003_R1_001 RW59_S12_L003_R2_001 embryoGFPminus_rep1 1 gutless
## 4 RW60_S13_L003_R1_001 RW60_S13_L003_R2_001 embryoWhole_rep2 2 whole
## 5 RW61_S14_L003_R1_001 RW61_S14_L003_R2_001 embryoCells_rep2 2 cells
## 6 RW62_S15_L003_R1_001 RW62_S15_L003_R2_001 embryoGFPplus_rep2 2 gut
## 7 RW63_S16_L003_R1_001 RW63_S16_L003_R2_001 embryoGFPminus_rep2 2 gutless
## 8 RW64_S17_L003_R1_001 RW64_S17_L003_R2_001 embryoWhole_rep3 3 whole
## 9 RW65_S18_L003_R1_001 RW65_S18_L003_R2_001 embryoGFPplus_rep3 3 gut
## 10 RW66_S19_L003_R1_001 RW66_S19_L003_R2_001 embryoGFPminus_rep3 3 gutless
Factorize metadata
metadata1$names <- factor(
metadata1$names,
levels = metadata1$names
)
metadata1$type <-
factor(metadata1$type, levels = c("gutless", "gut", "cells", "whole"))
Order columns according to metadata1 order
countsData <- countsData %>% select(chr:length, sort(metadata1$names))
head(countsData)
## chr start stop strand length embryoCells_rep1
## WBGene00014450 MtDNA 1 55 + 55 0
## WBGene00014451 MtDNA 58 111 + 54 0
## WBGene00010957 MtDNA 113 549 + 437 0
## WBGene00010958 MtDNA 549 783 + 235 0
## WBGene00014452 MtDNA 785 840 + 56 0
## WBGene00014453 MtDNA 842 896 + 55 0
## embryoGFPplus_rep1 embryoGFPminus_rep1 embryoWhole_rep2
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryoCells_rep2 embryoGFPplus_rep2 embryoGFPminus_rep2
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryoWhole_rep3 embryoGFPplus_rep3 embryoGFPminus_rep3
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
Generate a table called “cts” out of the countsData table. Subset the countsData.
cts <- as.matrix(countsData %>% select(metadata1$names))
head(cts)
## embryoCells_rep1 embryoGFPplus_rep1 embryoGFPminus_rep1
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryoWhole_rep2 embryoCells_rep2 embryoGFPplus_rep2
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryoGFPminus_rep2 embryoWhole_rep3 embryoGFPplus_rep3
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryoGFPminus_rep3
## WBGene00014450 0
## WBGene00014451 0
## WBGene00010957 0
## WBGene00010958 0
## WBGene00014452 0
## WBGene00014453 0
Reorganize the metadata table so the names2 column are now headers
rownames(metadata1)<- metadata1$names
coldata <- metadata1[,c("names", "rep", "type")]
rownames(coldata) <- as.vector(metadata1$names)
coldata
## names rep type
## embryoCells_rep1 embryoCells_rep1 1 cells
## embryoGFPplus_rep1 embryoGFPplus_rep1 1 gut
## embryoGFPminus_rep1 embryoGFPminus_rep1 1 gutless
## embryoWhole_rep2 embryoWhole_rep2 2 whole
## embryoCells_rep2 embryoCells_rep2 2 cells
## embryoGFPplus_rep2 embryoGFPplus_rep2 2 gut
## embryoGFPminus_rep2 embryoGFPminus_rep2 2 gutless
## embryoWhole_rep3 embryoWhole_rep3 3 whole
## embryoGFPplus_rep3 embryoGFPplus_rep3 3 gut
## embryoGFPminus_rep3 embryoGFPminus_rep3 3 gutless
Check that the names match –> Should be TRUE
all(rownames(coldata) == colnames(cts))
## [1] TRUE
Determine filtering threshold, identify read count threshold for non-intestine specific genes
tissue_specific_genes <- read_csv(file = "../01_input/tissue_specific_genes_220202.csv", show_col_types = FALSE)
head(tissue_specific_genes)
## # A tibble: 6 × 3
## WBGeneID Sequence.name tissue
## <chr> <chr> <chr>
## 1 WBGene00000007 aat-6 intestine
## 2 WBGene00000008 aat-7 intestine
## 3 WBGene00000016 abf-5 nervous-system
## 4 WBGene00000031 abu-8 intestine
## 5 WBGene00000034 abu-11 nervous-system
## 6 WBGene00000038 ace-4 nervous-system
cts_long <- as.data.frame(cts) %>% rownames_to_column(var = "WBGeneID") %>%
tidyr::pivot_longer(cols = embryoCells_rep1:embryoGFPminus_rep3, values_to = "reads") %>%
tidyr::separate(name, sep = "_", into = c("sample_type", "rep"))
cts_long_summary <- cts_long %>%
group_by(sample_type, WBGeneID) %>%
summarise(mean = mean(reads), variance = var(reads))
## `summarise()` has grouped output by 'sample_type'. You can override using the
## `.groups` argument.
cts_long_summary %>% ggplot(aes(x = log(mean), y = log(variance))) +
geom_point(alpha = 0.01) +
facet_wrap(~sample_type)
cts_long_summary %>%
left_join(tissue_specific_genes, by = "WBGeneID") %>%
mutate(tissue = replace_na(tissue, "not_specific")) %>%
ggplot(aes(x = log(mean), fill = sample_type)) +
geom_density(kernel = "gaussian", alpha = 0.25) +
facet_grid(~tissue)
## Warning: Removed 92003 rows containing non-finite values (stat_density).
Generate the DESeqDataSet. The variables in this design formula will be the type of sample, and the preparation date. This should reduce the variability between the samples based on when they were made.
From the vignette: “In order to benefit from the default settings of the package, you should put the variable of interest at the end of the formula and make sure the control level is the first level.”
The variable of interest is the sample type.
Using DESeqDataSetFromMatrix since I used the program featureCounts.
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design = ~ type)
Visualize read count distribution
hist(log(rowSums(counts(dds))))
abline(v = log(10), col = "red", lty = 2)
Filter genes with low read counts
keep <- rowSums(counts(dds)) >=10
dds <- dds[keep,]
dds
## class: DESeqDataSet
## dim: 22754 10
## metadata(1): version
## assays(1): counts
## rownames(22754): WBGene00021406 WBGene00021407 ... WBGene00199694
## WBGene00044951
## rowData names(0):
## colnames(10): embryoCells_rep1 embryoGFPplus_rep1 ...
## embryoGFPplus_rep3 embryoGFPminus_rep3
## colData names(3): names rep type
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds)
## [1] "Intercept" "type_gut_vs_gutless" "type_cells_vs_gutless"
## [4] "type_whole_vs_gutless"
res <- results(dds, contrast = c("type", "gut", "gutless"))
head(res)
## log2 fold change (MLE): type gut vs gutless
## Wald test p-value: type gut vs gutless
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## WBGene00021406 114.30007 1.064520 0.64308 1.655346 9.78543e-02
## WBGene00021407 16.09971 0.559255 1.43107 0.390794 6.95949e-01
## WBGene00021408 19.52792 7.933028 1.53015 5.184491 2.16606e-07
## WBGene00021405 2.27976 4.557575 2.27585 2.002584 4.52219e-02
## WBGene00021409 2.40590 5.185917 3.06609 1.691378 9.07647e-02
## WBGene00021404 20.94007 8.493339 1.43841 5.904675 3.53343e-09
## padj
## <numeric>
## WBGene00021406 1.89076e-01
## WBGene00021407 7.89266e-01
## WBGene00021408 2.18657e-06
## WBGene00021405 NA
## WBGene00021409 NA
## WBGene00021404 4.30464e-08
Write results output file.
res_df <- as.data.frame(res)
# write.csv(x = res_df, "./200511_L1_intestine_FACS_gut_vs_gutless.csv", quote = FALSE)
ma_gut_vs_gutless <- plotMA(res, ylim = c(-11,11), alpha = 0.05)
# pdf(file = "./EmbryoFACS_MA_plot_gut_vs_gutless_200930.pdf", 5, 5)
# plotMA(res, ylim = c(-11,11), alpha = 0.05)
# dev.off()
resLFC <- lfcShrink(dds, coef = "type_gut_vs_gutless", type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resApeglm <- lfcShrink(dds, coef = "type_gut_vs_gutless", type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef = "type_gut_vs_gutless", type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
par(mfrow = c(1,3), mar = c(4,4,2,1))
plotMA(resLFC, ylim=c(-10,15), main = "normal")
plotMA(resApeglm, ylim=c(-10,15), main = "apeglm")
plotMA(resAsh, ylim=c(-10,15), main = "ashr")
# write.csv(resApeglm %>% as.data.frame() %>% rownames_to_column(var = "WBGeneID"), file = "./200531_res_gut_vs_gutless_apeglmShrink.csv")
Export the plot
# pdf(file = "./200707_L1FACS_Gut_vs_Nongut_MAplot.pdf", height = 5, width = 5)
# plotMA(resApeglm, ylim=c(-10,10), main = "L1 FACS Gut vs Nongut Differential Expression")
# dev.off()
resApeglm.df <- as.data.frame(resApeglm) %>% rownames_to_column(var = "WBGeneID")
# write_csv(resApeglm.df, file = "../03_output/DE_Results_GFPplus-vs-GFPminus_apeglmShrink_220202.csv", col_names = TRUE)
rld <- rlog(dds)
rld.df <- as.data.frame(assay(rld)) %>% rownames_to_column(var = "WBGeneID")
# write_csv(rld.df, file = "../03_output/Embryo_Intestine_Rlog_Counts_220202.csv", col_names = TRUE)
vsd <- vst(dds, blind = FALSE)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- vsd$names
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)
Export the plot
# pdf(file = "../03_output/EmbryoFACS_Sample_Distance_Matrix_200930.pdf", height = 4, width = 6)
# pheatmap(sampleDistMatrix,
# clustering_distance_rows = sampleDists,
# clustering_distance_cols = sampleDists,
# col = colors)
# dev.off()
plotDispEsts(dds)
select_rows <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:20]
df <- as.data.frame(colData(dds)[,c("type","rep")]) %>% mutate(type = fct_recode(type), type = fct_relevel(type, c("whole", "cells", "gut", "gutless"))) %>% arrange(type)
pheatmap(assay(rld)[select_rows,rownames(df)], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df)
Review automated filtering
metadata(res)$alpha
## [1] 0.1
metadata(res)$filterThreshold
## 13.57143%
## 2.506608
plot(metadata(res)$filterNumRej,
type = "b", ylab = "number of rejections",
xlab = "quantiles of filter")
lines(metadata(res)$lo.fit, col="red")
abline(v=metadata(res)$filterTheta)
par(mfrow=c(2,2),mar=c(2,2,1,1))
ylim <- c(-7,12.5)
resGA <- results(dds, contrast = c("type", "gut", "gutless"), lfcThreshold=2, altHypothesis="greaterAbs")
resLA <- results(dds, contrast = c("type", "gut", "gutless"), lfcThreshold=2, altHypothesis="lessAbs")
resG <- results(dds,contrast = c("type", "gut", "gutless"), lfcThreshold=2, altHypothesis="greater")
resL <- results(dds,contrast = c("type", "gut", "gutless"), lfcThreshold=2, altHypothesis="less")
drawLines <- function() abline(h=c(-2,2),col="dodgerblue",lwd=2)
plotMA(resGA, ylim=ylim); drawLines()
plotMA(resLA, ylim=ylim); drawLines()
plotMA(resG, ylim=ylim); drawLines()
plotMA(resL, ylim=ylim); drawLines()
joined_ma <- plotMA(resApeglm, alpha = 0.01, returnData = TRUE) %>%
rownames_to_column(var = "WBGeneID") %>%
left_join(tissue_specific_genes, by = "WBGeneID") %>%
mutate(tissue = replace_na(tissue, "other-genes")) %>%
mutate(tissue = fct_recode(tissue), tissue = fct_relevel(tissue, c("intestine", "rectum", "excretory-system", "coelomic-system", "epithelial-system", "muscular-system", "nervous-system", "reproductive-system", "other-genes")))
## Warning: Unknown levels in `f`: muscular-system
tissue_genes_totals <- joined_ma %>% mutate(status = case_when(
lfc >= 2 & isDE == TRUE ~"enriched",
lfc <= 2 & isDE == TRUE ~ "depleted",
TRUE ~ "no_sig_diff")) %>%
group_by(tissue, status) %>% summarise(genes_per_tissue = n_distinct(WBGeneID)) %>%
mutate(mean = 1, lfc = 1, isDE = NA)
## `summarise()` has grouped output by 'tissue'. You can override using the
## `.groups` argument.
ma_tissue_facet <- joined_ma %>%
ggplot(aes(x = log(mean), y = lfc, color = isDE == TRUE)) +
geom_point(data = joined_ma %>% select(-tissue), alpha = 0.01, color = "lightgrey") +
geom_point(alpha = 0.1) +
scale_color_manual(values = c("black", "red"), name = "q.value < 0.01") +
geom_hline(yintercept = c(-2,2), linetype = "dashed") +
# annotate(geom = "text", label = "420", size = 4, x = 1, y = 1) +
theme_linedraw() +
facet_wrap(~tissue) +
ylab("log2FoldChange(GFP+/GFP-)") +
xlab("log10(mean normalized read counts")
ma_tissue_facet_labeled <- ma_tissue_facet +
geom_text(data = tissue_genes_totals %>% filter(status == "enriched"),
mapping = aes(x = 6, y = 11,label = paste("enriched genes = ",genes_per_tissue)),
color = "black",
size = 3) +
geom_text(data = tissue_genes_totals %>% filter(status == "depleted"),
mapping = aes(x = 6, y = -6,label = paste("depleted genes = ",genes_per_tissue)),
color = "black",
size = 3)
ma_tissue_facet_labeled
export the plot
ggsave(plot = ma_tissue_facet_labeled, file = "../03_output/Embryo_Intestine_MA_per_organ_system_220202.pdf",
width = 8,
height = 6)
ggsave(plot = ma_tissue_facet_labeled, file = "../03_output/Embryo_Intestine_MA_per_organ_system_220202.png",
width = 8,
height = 6,
dpi = 700)
Histogram of log2FoldChange facet by tissue
joined_ma %>% filter(isDE == TRUE) %>%
ggplot(aes(x = lfc)) +
geom_density(data = joined_ma %>% select(-tissue), alpha = 0.1, color = "grey") +
geom_density(kernel = "gaussian", alpha = 0.25, aes(fill = tissue)) +
geom_vline(xintercept = c(-2,2), linetype = "dotted")+
theme_classic() +
facet_wrap(~tissue)
im <- initInterMine(mine = listMines()["WormMine"], "m197OeDbhcudmcu143d2")
constraints = setConstraints(
paths = "Gene.expressionClusters.primaryIdentifier",
operators = "=",
values = list("WBPaper00037950:intestine_embryo_enriched")
)
queryGeneIds = setQuery(
select = c(
"Gene.primaryIdentifier",
"Gene.symbol",
"Gene.expressionClusters.primaryIdentifier"
),
where = constraints
)
spencer_embryo_genes <- runQuery(im = im, qry = queryGeneIds)
head(spencer_embryo_genes)
## Gene.primaryIdentifier Gene.symbol Gene.expressionClusters.primaryIdentifier
## 1 WBGene00000005 aat-4 WBPaper00037950:intestine_embryo_enriched
## 2 WBGene00000040 aco-1 WBPaper00037950:intestine_embryo_enriched
## 3 WBGene00000067 act-5 WBPaper00037950:intestine_embryo_enriched
## 4 WBGene00000073 add-2 WBPaper00037950:intestine_embryo_enriched
## 5 WBGene00000098 air-1 WBPaper00037950:intestine_embryo_enriched
## 6 WBGene00000100 ajm-1 WBPaper00037950:intestine_embryo_enriched
spencer_ma <- plotMA(resApeglm, alpha = 0.01, returnData = TRUE) %>% rownames_to_column(var = "WBGeneID") %>%
mutate(spencer_embryo = WBGeneID %in% spencer_embryo_genes$Gene.primaryIdentifier)
spencer_ma %>% filter(spencer_embryo == TRUE) %>%
ggplot(aes(x = log(mean), y = lfc, color = isDE)) +
geom_point(data = spencer_ma %>% select(-spencer_embryo), alpha = 0.1, color = "grey") +
geom_point(alpha = 0.25)
Take a different approach to label a small set of known tissue specific genes
ground_truth <- readxl::read_xlsx(path = "../01_input/Tissue_Specific_Ground_Truth_RTPW.xlsx")
tissue_specific_markers <- read.csv(file = "../01_input/Tissue_Specific_Marker_Genes.tsv", header = TRUE, sep = "\t")
neuron_truth <- ground_truth %>% rowwise() %>% mutate(neuron_count = sum(c_across(ADA:VD_DD))) %>% select(WBGeneID, gene_name, neuron_count)
neuron_truth
## # A tibble: 161 × 3
## # Rowwise:
## WBGeneID gene_name neuron_count
## <chr> <chr> <dbl>
## 1 WBGene00000437 ceh-13 8
## 2 WBGene00003102 mab-5 12
## 3 WBGene00003024 lin-39 14
## 4 WBGene00001174 egl-5 12
## 5 WBGene00003779 nob-1 6
## 6 WBGene00004024 php-3 7
## 7 WBGene00006873 vab-7 6
## 8 WBGene00003912 pal-1 0
## 9 WBGene00000428 ceh-1 2
## 10 WBGene00000434 ceh-9 7
## # … with 151 more rows
ubiquitous_genes <- neuron_truth %>% filter(neuron_count == ground_truth %>% dplyr::select(ADA:VD_DD) %>% ncol())
ubiquitous_genes
## # A tibble: 27 × 3
## # Rowwise:
## WBGeneID gene_name neuron_count
## <chr> <chr> <dbl>
## 1 WBGene00000451 ceh-30 128
## 2 WBGene00000444 ceh-21 128
## 3 WBGene00000459 ceh-38 128
## 4 WBGene00000460 ceh-39 128
## 5 WBGene00000462 ceh-41 128
## 6 WBGene00015934 ceh-48 128
## 7 WBGene00000464 ceh-44 128
## 8 WBGene00007417 ceh-58 128
## 9 WBGene00013876 ceh-74 128
## 10 WBGene00018433 ceh-82 128
## # … with 17 more rows
neuron_truth %>% inner_join(tissue_specific_markers, by = c("WBGeneID" = "ExpressionPattern.genes.primaryIdentifier"))
## # A tibble: 36 × 14
## # Rowwise:
## WBGeneID gene_name neuron_count ExpressionPattern.pat… ExpressionPatte…
## <chr> <chr> <dbl> <chr> <chr>
## 1 WBGene00001174 egl-5 12 gfp is expressed in a… Marker35
## 2 WBGene00000584 cog-1 14 Marker for ASER neuro… Marker97
## 3 WBGene00000584 cog-1 14 Marker for PDA cell. Marker114
## 4 WBGene00006818 unc-86 31 Expressed in HSN neur… Marker79
## 5 WBGene00006818 unc-86 31 Expressed in HSN neur… Marker79
## 6 WBGene00003000 lin-11 19 Expressed in secondar… Marker17
## 7 WBGene00003000 lin-11 19 Expressed in secondar… Marker17
## 8 WBGene00003000 lin-11 19 Marker for AIZ neuron… Marker103
## 9 WBGene00003000 lin-11 19 Marker for AIZ neuron… Marker103
## 10 WBGene00003000 lin-11 19 Marker for secondary … Marker77
## # … with 26 more rows, and 9 more variables: ExpressionPattern.remark <chr>,
## # ExpressionPattern.reporterGene <chr>,
## # ExpressionPattern.subcellularLocalization <chr>,
## # ExpressionPattern.anatomyTerms.name <chr>,
## # ExpressionPattern.anatomyTerms.synonym <chr>,
## # ExpressionPattern.anatomyTerms.definition <chr>,
## # ExpressionPattern.anatomyTerms.primaryIdentifier <chr>, …
germline_genes <- data.frame(WBGeneID = c("WBGene00001598", "WBGene00003993", "WBGene00003992", "WBGene00010492"), gene_name = c("glh-1", "pgl-2", "pgl-1", "meg-1"))
germline_genes$tissue <- "germline"
ground_truth_markers <- ground_truth %>% mutate(tissue= case_when(
gene_name == "elt-2" ~ "intestine",
WBGeneID %in% (neuron_truth %>% inner_join(tissue_specific_markers, by = c("WBGeneID" = "ExpressionPattern.genes.primaryIdentifier")))$WBGeneID ~ "neuron",
Intestine == 1 & Hypodermis == 0 & Muscle == 0 & Germline == 0 & Pharynx == 0 & Neuron == 0 ~ "intestine",
Intestine == 0 & Hypodermis == 1 & Muscle == 0 & Germline == 0 & Pharynx == 0 & Neuron == 0 ~ "hypodermis",
Intestine == 0 & Hypodermis == 0 & Muscle == 1 & Germline == 0 & Pharynx == 0 & Neuron == 0 ~ "muscle",
Intestine == 0 & Hypodermis == 0 & Muscle == 0 & Germline == 1 & Pharynx == 0 & Neuron == 0 ~ "germline",
Intestine == 0 & Hypodermis == 0 & Muscle == 0 & Germline == 0 & Pharynx == 1 & Neuron == 0 ~ "pharynx"
)) %>%
dplyr::select(!(ADA:VD_DD)) %>%
dplyr::select(!(Intestine:Neuron)) %>%
drop_na(tissue) %>%
bind_rows(germline_genes)
ground_truth_markers$tissue <- factor(ground_truth_markers$tissue, levels = c("intestine", "hypodermis", "germline", "pharynx", "muscle", "neuron"))
tissues <- c("intestine", "hypodermis", "germline", "pharynx", "muscle", "neuron")
ordered_genes <- c()
for(i in tissues){
genes <- (ground_truth_markers %>% filter(tissue == i))$gene_name
ordered_genes <- append(ordered_genes, genes)
}
ordered_genes
## [1] "ifb-2" "act-5" "elt-2" "ges-1" "lon-3" "bli-1" "glh-1" "pgl-2"
## [9] "pgl-1" "meg-1" "myo-2" "hlh-1" "fhod-1" "egl-5" "cog-1" "unc-86"
## [17] "lin-11" "ttx-3" "lim-6" "unc-4" "ceh-10" "ceh-36" "sng-1" "rgef-1"
## [25] "myo-3"
fct_count(ground_truth_markers$tissue)
## # A tibble: 6 × 2
## f n
## <fct> <int>
## 1 intestine 4
## 2 hypodermis 2
## 3 germline 4
## 4 pharynx 1
## 5 muscle 2
## 6 neuron 12
joined_ma_markers <- plotMA(resApeglm, alpha = 0.01, returnData = TRUE) %>% rownames_to_column(var = "WBGeneID") %>% left_join(ground_truth_markers, by = "WBGeneID") %>% drop_na(tissue)
# mutate(tissue = replace_na(tissue, "not_specific"))
joined_ma_markers %>% #filter(isDE == TRUE) %>%
ggplot(aes(x = log(mean), y = lfc, color = tissue)) +
geom_point(data = joined_ma %>% select(-tissue), alpha = 0.1, color = "grey") +
geom_point(alpha = 0.5) +
geom_hline(yintercept = c(-2,2), linetype = "dotted")+
theme_classic() +
facet_wrap(~tissue)